#load relevant libraries
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(ggplot2)
# Read the raw course export into a data frame.
MasterData <- read.csv("~/downloads/FRE1120 Data Summary - RAW.csv")
# Keep only the students who were required to do the LearnSmart (LS) exercises.
MasterData <- MasterData[MasterData$LS.Required == "y", ]
# Keep only the English, Spanish, and Creole linguistic backgrounds; the other
# backgrounds have too few students.
MasterData <- MasterData[
  MasterData$Linguistic.Background %in% c("English", "Creole", "Spanish"),
]
# Derive the two calculated fields in one mutate() call:
#   Per_correct = Correct...aware + Correct...unaware
#   Per_aware   = Correct...aware + Incorrect...aware
MasterData <- MasterData %>%
  mutate(
    Per_correct = Correct...aware + Correct...unaware,
    Per_aware = Correct...aware + Incorrect...aware
  )
# Drop the redundant columns and give the remaining ones short names in the
# same step (short_name = "original.name"); column order is the select order.
MasterData <- MasterData %>%
  select(
    ID = "ID..",
    Sex = "Sex",
    Language = "Linguistic.Background",
    Final_grade = "Final.Grade",
    IU = "Incorrect...unaware",
    Per_correct = "Per_correct",
    Per_aware = "Per_aware",
    Time_hw = "Time.Spent..HW.",
    Time_pron = "Time.Spent..Pronunciation.Practice.",
    Time_LS = "Time.Spent..LearnSmart.",
    Per_correct_hw = "Total.HW...Correct",
    Per_complete_LS = "Total.LS...Complete"
  )
# Examine scatter plots for different variable combination pairs.
# Scatter-plot matrix of every column. data.matrix() converts character and
# factor columns (e.g. Sex, Language) to integer codes first: pairs() stops
# with "non-numeric argument to 'pairs'" when a data frame holds character
# columns, which is what read.csv() returns by default since R 4.0.
pairs(data.matrix(MasterData))

# Each plot below overlays stat_sum() on the raw points so that coincident
# observations show up as larger points (point size = number of overlaps).

# Time spent on Connect homework vs. percentage of correct homework responses
ggplot(MasterData, aes(x = Time_hw, y = Per_correct_hw)) +
  geom_point() + stat_sum(aes(group = 1))

# Time spent on Connect homework vs. time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(x = Time_hw, y = Time_LS)) +
  geom_point() + stat_sum(aes(group = 1))

# Percentage of correct homework responses vs. final course grade
ggplot(MasterData, aes(x = Per_correct_hw, y = Final_grade)) +
  geom_point() + stat_sum(aes(group = 1))

# Percentage of correct LearnSmart responses vs. percentage of awareness of
# correct/incorrect responses
ggplot(MasterData, aes(x = Per_correct, y = Per_aware)) +
  geom_point() + stat_sum(aes(group = 1))

# Percentage of completion of assigned LearnSmart activities vs. final course grade
ggplot(MasterData, aes(x = Per_complete_LS, y = Final_grade)) +
  geom_point() + stat_sum(aes(group = 1))

# Percentage of completion of assigned LearnSmart activities vs. percentage of
# incorrect and unaware LearnSmart responses
ggplot(MasterData, aes(x = Per_complete_LS, y = IU)) +
  geom_point() + stat_sum(aes(group = 1))

# Time spent on LearnSmart adaptive activities vs. final course grade
ggplot(MasterData, aes(x = Time_LS, y = Final_grade)) +
  geom_point() + stat_sum(aes(group = 1))

# Percentage of incorrect and unaware LearnSmart responses vs. final course grade
ggplot(MasterData, aes(x = IU, y = Final_grade)) +
  geom_point() + stat_sum(aes(group = 1))

# Percentage of incorrect and unaware LearnSmart responses vs. time spent on
# LearnSmart adaptive activities
ggplot(MasterData, aes(x = IU, y = Time_LS)) +
  geom_point() + stat_sum(aes(group = 1))

# Percentage of incorrect and unaware LearnSmart responses vs. time spent on
# Connect homework
ggplot(MasterData, aes(x = IU, y = Time_hw)) +
  geom_point() + stat_sum(aes(group = 1))

# Build the scaled data frame: the first three columns of MasterData are
# carried over unchanged, and the columns Final_grade through Per_complete_LS
# are standardised with scale().
ScaledData <- MasterData %>%
  select(Final_grade:Per_complete_LS) %>%
  scale() %>%
  cbind(MasterData[, 1:3], .) %>%
  as.data.frame()
# Check the scaling: the mean of every scaled column should be ~0.
vapply(ScaledData[, 4:12], mean, numeric(1))
    Final_grade              IU     Per_correct       Per_aware         Time_hw       Time_pron         Time_LS 
   5.843135e-17   -1.133857e-16    2.872520e-16   -4.121555e-16   -3.776554e-17    5.052708e-17    8.279565e-17 
 Per_correct_hw Per_complete_LS 
   2.015534e-16    4.870925e-18 
# Check the scaling: the standard deviation of every scaled column should be 1.
vapply(ScaledData[, 4:12], sd, numeric(1))
    Final_grade              IU     Per_correct       Per_aware         Time_hw       Time_pron         Time_LS 
              1               1               1               1               1               1               1 
 Per_correct_hw Per_complete_LS 
              1               1 
# Plot the within-groups sum of squares (WSS) for k-means solutions with
# 1..nc clusters (an "elbow" plot).
#
# Arguments:
#   data  numeric matrix or data frame, one row per observation.
#   nc    largest number of clusters to try (a single number >= 1).
#   seed  RNG seed, re-set before every kmeans() call so each fit starts from
#         the same random state.
# Returns (invisibly) the numeric vector of WSS values; element k belongs to
# the k-cluster solution.
# Side effects: draws on the active graphics device and resets the global RNG
# seed.
wssplot <- function(data, nc=15, seed=1234){
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)
  # Preallocate instead of growing the vector inside the loop.
  wss <- numeric(nc)
  # One cluster: WSS is the total sum of squares around the column means.
  wss[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(nc)[-1] is empty when nc == 1, whereas 2:nc would count backwards
  # (2, 1) and leave wss longer than the x axis.
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}
# Draw the WSS curve for 1 to 15 clusters on the nine scaled variables
# (columns 4 to 12 of ScaledData).
wssplot(data = ScaledData[, 4:12], nc = 15, seed = 1234)

# Create and examine several different possible cluster solutions.
# kmeans() picks its starting centres at random (nstart = 10 random starts), so
# fix the seed first; otherwise cluster numbering and membership can change
# from run to run and the cluster plots stop matching earlier output.
set.seed(1234)
threeclusterkmeans <- kmeans(ScaledData[, 4:12], centers = 3, nstart = 10)
threeclusterkmeans
K-means clustering with 3 clusters of sizes 15, 68, 46

Cluster means:
  Final_grade         IU Per_correct  Per_aware     Time_hw   Time_pron    Time_LS Per_correct_hw Per_complete_LS
1 -1.66048356 -0.4683417   0.4338804  0.2462526 -1.34682486 -0.38218605 -0.7236692   -2.050320950      -2.0744039
2  0.35681501 -0.5420303   0.4596331  0.5090781  0.02322455 -0.03493209 -0.2628125    0.450550769       0.2919678
3  0.01399637  0.9539823  -0.8209404 -0.8328501  0.40485007  0.17626462  0.6244845    0.002551347       0.2448315

Clustering vector:
  [1] 3 2 2 2 1 3 3 3 2 1 3 1 2 1 3 2 2 2 1 2 2 3 2 3 1 1 2 2 3 2 2 2 1 2 3 2 1 3 3 1 3 3 3 3 3 3 2 2 2 2 3 3 3 2 2 3 3 3 2 1 2
 [62] 2 3 2 2 3 2 2 1 2 2 2 2 2 3 3 3 2 2 3 2 2 2 3 2 2 3 2 1 3 3 3 2 2 1 2 2 2 3 2 2 2 3 3 2 2 2 2 2 2 3 1 3 2 2 3 3 2 2 2 2 2
[123] 3 3 3 2 2 2 3

Within cluster sum of squares by cluster:
[1] 108.0055 317.0107 294.2538
 (between_SS / total_SS =  37.6 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"         "iter"        
[9] "ifault"      
# Four-cluster solution on the same scaled variables. The seed is fixed so the
# random starts, and therefore the cluster labels, are reproducible.
set.seed(1234)
fourclusterkmeans <- kmeans(ScaledData[, 4:12], centers = 4, nstart = 10)
fourclusterkmeans
K-means clustering with 4 clusters of sizes 19, 15, 37, 58

Cluster means:
  Final_grade         IU Per_correct  Per_aware     Time_hw  Time_pron    Time_LS Per_correct_hw Per_complete_LS
1  -0.3507149  1.1769446  -0.9840832 -1.1188646  0.87451648  1.0314872  1.5405665     -0.1017225       0.1292924
2  -1.6964014 -0.6400324   0.4608217  0.2805430 -1.33097412 -0.3821861 -0.7669705     -1.8427475      -2.1677251
3   0.5572135 -0.7840533   0.9818083  0.8836826  0.07144945  0.2390765 -0.2894854      0.5519276       0.4797214
4   0.1981501  0.2801468  -0.4231319 -0.2697582  0.01215809 -0.3915741 -0.1216422      0.1578038       0.2122350

Clustering vector:
  [1] 1 4 4 4 2 4 4 4 2 2 4 2 3 2 1 3 3 3 2 4 3 4 4 4 2 2 4 3 1 3 3 4 2 4 1 3 2 1 1 2 1 1 1 1 1 4 3 4 3 3 4 4 4 4 3 1 4 4 4 2 3
 [62] 3 1 4 4 4 3 4 4 3 4 3 3 3 4 1 4 4 4 4 3 4 3 4 4 4 1 4 2 4 4 4 4 3 2 3 4 3 1 3 3 3 4 4 4 4 4 4 3 3 1 2 4 4 4 4 1 3 3 3 3 3
[123] 1 4 4 4 3 3 4

Within cluster sum of squares by cluster:
[1] 162.9807 111.4874 159.6486 194.7688
 (between_SS / total_SS =  45.4 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"         "iter"        
[9] "ifault"      
# Attach the k = 3 and k = 4 cluster assignments (as factors) to the scaled
# data. The ScaledData columns keep their names; the two new columns are named
# directly in the data.frame() call.
Clusters <- data.frame(
  ScaledData,
  ClusterToThree = as.factor(threeclusterkmeans$cluster),
  ClusterToFour = as.factor(fourclusterkmeans$cluster)
)
# Three-cluster solution: scatter plots coloured by cluster.
# Point shape encodes language background. To encode a different variable by
# shape, replace Language in "shape = Language".
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

# Four-cluster solution: scatter plots coloured by cluster.
# Point shape encodes language background. To encode a different variable by
# shape, replace Language in "shape = Language".
# size = 2 is a fixed setting, so it is passed to geom_point() outside aes();
# inside aes() it would be mapped through the size scale as a constant and add
# a meaningless "size" legend instead of setting the point size to 2.
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

# Count the students of each language background in each cluster of the
# four-cluster solution. Rows of LangsCount are keyed by language name (j) and
# columns by cluster number (i).
for (j in langs) {
  # The language name is NOT written into column 1: that column holds the
  # cluster-1 count, so storing a string there coerces the column (or triggers
  # "invalid factor level" warnings) before the count overwrites it. Indexing
  # by row name j is enough to label the row.
  for (i in seq_len(4)) {
    LangsCount[j, i] <- sum(UnscaledClusters$Language == j & UnscaledClusters$ClusterToFour == i)
  }
}
invalid factor level, NA generatedinvalid factor level, NA generatedinvalid factor level, NA generatedinvalid factor level, NA generatedinvalid factor level, NA generatedinvalid factor level, NA generated
---
title: "R Notebook"
output: html_notebook
---
  
```{r}
#load relevant libraries
library(dplyr)
library(ggplot2)

# Read the raw course export into a data frame.
MasterData <- read.csv("~/downloads/FRE1120 Data Summary - RAW.csv")
# Keep only the students who were required to do the LearnSmart (LS) exercises.
MasterData <- MasterData[MasterData$LS.Required == "y", ]
# Keep only the English, Spanish, and Creole linguistic backgrounds; the other
# backgrounds have too few students.
MasterData <- MasterData[
  MasterData$Linguistic.Background %in% c("English", "Creole", "Spanish"),
]
# Derive the two calculated fields in one mutate() call:
#   Per_correct = Correct...aware + Correct...unaware
#   Per_aware   = Correct...aware + Incorrect...aware
MasterData <- MasterData %>%
  mutate(
    Per_correct = Correct...aware + Correct...unaware,
    Per_aware = Correct...aware + Incorrect...aware
  )
# Drop the redundant columns and give the remaining ones short names in the
# same step (short_name = "original.name"); column order is the select order.
MasterData <- MasterData %>%
  select(
    ID = "ID..",
    Sex = "Sex",
    Language = "Linguistic.Background",
    Final_grade = "Final.Grade",
    IU = "Incorrect...unaware",
    Per_correct = "Per_correct",
    Per_aware = "Per_aware",
    Time_hw = "Time.Spent..HW.",
    Time_pron = "Time.Spent..Pronunciation.Practice.",
    Time_LS = "Time.Spent..LearnSmart.",
    Per_correct_hw = "Total.HW...Correct",
    Per_complete_LS = "Total.LS...Complete"
  )

```

```{r}
# Examine scatter plots for different variable combination pairs.
# Scatter-plot matrix of every column. data.matrix() converts character and
# factor columns (e.g. Sex, Language) to integer codes first: pairs() stops
# with "non-numeric argument to 'pairs'" when a data frame holds character
# columns, which is what read.csv() returns by default since R 4.0.
pairs(data.matrix(MasterData))
# Each plot below overlays stat_sum() on the raw points so that coincident
# observations show up as larger points (point size = number of overlaps).
# Time spent on Connect homework vs. percentage of correct homework responses
ggplot(MasterData, aes(x = Time_hw, y = Per_correct_hw)) +
  geom_point() + stat_sum(aes(group = 1))
# Time spent on Connect homework vs. time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(x = Time_hw, y = Time_LS)) +
  geom_point() + stat_sum(aes(group = 1))
# Percentage of correct homework responses vs. final course grade
ggplot(MasterData, aes(x = Per_correct_hw, y = Final_grade)) +
  geom_point() + stat_sum(aes(group = 1))
# Percentage of correct LearnSmart responses vs. percentage of awareness of
# correct/incorrect responses
ggplot(MasterData, aes(x = Per_correct, y = Per_aware)) +
  geom_point() + stat_sum(aes(group = 1))
# Percentage of completion of assigned LearnSmart activities vs. final course grade
ggplot(MasterData, aes(x = Per_complete_LS, y = Final_grade)) +
  geom_point() + stat_sum(aes(group = 1))
# Percentage of completion of assigned LearnSmart activities vs. percentage of
# incorrect and unaware LearnSmart responses
ggplot(MasterData, aes(x = Per_complete_LS, y = IU)) +
  geom_point() + stat_sum(aes(group = 1))
# Time spent on LearnSmart adaptive activities vs. final course grade
ggplot(MasterData, aes(x = Time_LS, y = Final_grade)) +
  geom_point() + stat_sum(aes(group = 1))
# Percentage of incorrect and unaware LearnSmart responses vs. final course grade
ggplot(MasterData, aes(x = IU, y = Final_grade)) +
  geom_point() + stat_sum(aes(group = 1))
# Percentage of incorrect and unaware LearnSmart responses vs. time spent on
# LearnSmart adaptive activities
ggplot(MasterData, aes(x = IU, y = Time_LS)) +
  geom_point() + stat_sum(aes(group = 1))
# Percentage of incorrect and unaware LearnSmart responses vs. time spent on
# Connect homework
ggplot(MasterData, aes(x = IU, y = Time_hw)) +
  geom_point() + stat_sum(aes(group = 1))
```

```{r}
# Build the scaled data frame: the first three columns of MasterData are
# carried over unchanged, and the columns Final_grade through Per_complete_LS
# are standardised with scale().
ScaledData <- MasterData %>%
  select(Final_grade:Per_complete_LS) %>%
  scale() %>%
  cbind(MasterData[, 1:3], .) %>%
  as.data.frame()
# Check the scaling: every scaled column should have mean ~0 ...
vapply(ScaledData[, 4:12], mean, numeric(1))
# ... and standard deviation 1.
vapply(ScaledData[, 4:12], sd, numeric(1))
```

```{r}
# Plot the within-groups sum of squares (WSS) for k-means solutions with
# 1..nc clusters (an "elbow" plot).
#
# Arguments:
#   data  numeric matrix or data frame, one row per observation.
#   nc    largest number of clusters to try (a single number >= 1).
#   seed  RNG seed, re-set before every kmeans() call so each fit starts from
#         the same random state.
# Returns (invisibly) the numeric vector of WSS values; element k belongs to
# the k-cluster solution.
# Side effects: draws on the active graphics device and resets the global RNG
# seed.
wssplot <- function(data, nc=15, seed=1234){
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)
  # Preallocate instead of growing the vector inside the loop.
  wss <- numeric(nc)
  # One cluster: WSS is the total sum of squares around the column means.
  wss[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(nc)[-1] is empty when nc == 1, whereas 2:nc would count backwards
  # (2, 1) and leave wss longer than the x axis.
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}

# Draw the WSS curve for 1 to 15 clusters on the nine scaled variables
# (columns 4 to 12 of ScaledData).
wssplot(data = ScaledData[, 4:12], nc = 15, seed = 1234)
```
```{r}
# Create and examine several different possible cluster solutions.
# kmeans() picks its starting centres at random (nstart = 10 random starts), so
# the seed is fixed before each fit; otherwise cluster numbering and membership
# can change from run to run and the cluster plots and counts that follow stop
# being reproducible.
set.seed(1234)
threeclusterkmeans <- kmeans(ScaledData[, 4:12], centers = 3, nstart = 10)
threeclusterkmeans
set.seed(1234)
fourclusterkmeans <- kmeans(ScaledData[, 4:12], centers = 4, nstart = 10)
fourclusterkmeans
```
```{r}
# Attach the k = 3 and k = 4 cluster assignments (as factors) to the scaled
# data. The ScaledData columns keep their names; the two new columns are named
# directly in the data.frame() call.
Clusters <- data.frame(
  ScaledData,
  ClusterToThree = as.factor(threeclusterkmeans$cluster),
  ClusterToFour = as.factor(fourclusterkmeans$cluster)
)
```

```{r}
# Three-cluster solution: scatter plots coloured by cluster.
# Point shape encodes language background. To encode a different variable by
# shape, replace Language in "shape = Language".
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
```

```{r}
# Four-cluster solution: scatter plots coloured by cluster.
# Point shape encodes language background. To encode a different variable by
# shape, replace Language in "shape = Language".
# size = 2 is a fixed setting, so it is passed to geom_point() outside aes();
# inside aes() it would be mapped through the size scale as a constant and add
# a meaningless "size" legend instead of setting the point size to 2.
ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
```



```{r}
# Descriptive stats for each cluster in the 4-cluster solution.
# Attach the k = 3 and k = 4 cluster assignments to the unscaled data.
UnscaledClusters <- data.frame(
  MasterData,
  ClusterToThree = as.factor(threeclusterkmeans$cluster),
  ClusterToFour = as.factor(fourclusterkmeans$cluster)
)
# Language backgrounds to tabulate, in the row order wanted for LangsCount.
langs <- c("English", "Spanish", "Creole")
# Cross-tabulate language by four-cluster membership in one step. The previous
# loop first wrote the language name into the cluster-1 count column
# (LangsCount[j, 1] <- j), which coerced that integer column and produced
# "invalid factor level, NA generated" warnings before the count overwrote it.
# Fixing the factor levels keeps all three languages and all four clusters in
# the table even when a count is zero.
lang_by_cluster <- table(
  factor(UnscaledClusters$Language, levels = langs),
  factor(UnscaledClusters$ClusterToFour, levels = 1:4)
)
# One row per language (row names), one integer count column per cluster.
LangsCount <- as.data.frame.matrix(lang_by_cluster)
names(LangsCount) <- paste0("cluster", 1:4, "count")
```



